The goal of this project is to analyze time series collections of achievement histories of a sample of gamers on Xbox’s TrueAchievements website. Using this we can answer questions such as whether we can forecast engagement.
Data scraped from Xbox TrueAchievements using Python with the details in the full report. Webscraping Methodology
directory_df = create_file_directory()
directory_df = directory_transformations(directory_df)
lb_df = read.csv("./data/leaderboard/leaderboard.csv")
lb_df = lb_feature_transformations(lb_df)
achievements_manifest = read.csv("./data/manifest/achievements_manifest.csv")
set.seed(196)
rnd_gamer_sample = sample_random_gamers(200, directory_df = directory_df)
rnd_gamer_sample = lapply(rnd_gamer_sample, function(x) x[order(rnd_gamer_sample[[3]])])
rnd_gamer_sample[[1]] = achievement_transform_today(rnd_gamer_sample[[1]], directory_df)
rnd_gamer_sample[[1]] = achievement_transform_yesterday(rnd_gamer_sample[[1]], directory_df)
rnd_gamer_sample[[1]] = achievement_transform_drop_offline(rnd_gamer_sample[[1]])
rnd_gamer_sample[[1]] = achievement_transform_format_dates(rnd_gamer_sample[[1]])
rnd_gamer_sample[[1]] = achievement_transform_extract_dates(rnd_gamer_sample[[1]])
rnd_gamer_sample[[2]] = games_transform_drop_bad_titles(rnd_gamer_sample[[2]])
rnd_gamer_sample[[2]] = games_transform_hours(rnd_gamer_sample[[2]])
print(paste("TOTAL OBSERVATIONS:", get_total_observations(rnd_gamer_sample[[1]])))
## [1] "TOTAL OBSERVATIONS: 380929"
metrics_df = process_metrics_df(rnd_gamer_sample, directory_df)
frequency_dfs = achievement_calculate_frequencies(rnd_gamer_sample)
frequency_combined_df = bind_rows(frequency_dfs, .id = "data_frame_id")
frequency_combined_df$data_frame_id = as.numeric(frequency_combined_df$data_frame_id)
da_df = calculate_daily_achievements(frequency_combined_df)
da_df = da_fill_dates(da_df)
da_profiles = da_split_by_profile(da_df)
da_profiles = da_profiles_set_churn(da_profiles)
## [1] "PROFILE: 145 DROPPED (All NA values)"
da_profiles = da_profiles_set_days_existence(da_profiles)
da_profiles = calculate_daily_lt_eir(da_profiles)
da_profiles = calculate_weekly_eir_all(da_profiles)
da_profiles = calculate_monthly_eir_all(da_profiles)
plot_lb_range_interactive(lb_df, "Score", 0, 4000000, 1000000)
# Plot histogram of churned with different colors for TRUE, FALSE, and NA
ggplot(metrics_df, aes(x = churned, fill = factor(churned))) +
geom_bar(color = "white") +
scale_fill_manual(values = c("darkgreen", "darkred", "gray")) +
labs(title = "Churned Histogram (365 Days Since Last Achievement)", x = "Churned Status", y = "Count")
ggplot(metrics_df, aes(x = longest_streak, fill = factor(longest_streak))) +
geom_bar(color = "white") +
labs(title = "Streak Histogram", x = "Longest Streak (in Days)", y = "Count")
# Create the box plot for game time
ggplot(metrics_df, aes(x = "", y = total_game_time_minutes / 60, fill = "Game Time")) +
geom_boxplot(width = 0.5, position = position_dodge(width = 0.9), color = "black", outlier.color = "darkred", outlier.shape = 16, outlier.size = 3) +
labs(x = "", y = "Game Time (Hours)", fill = "") +
scale_fill_manual(values = "#FF7F00") +
theme(legend.position = "top", legend.title = element_blank()) +
scale_y_continuous(labels = scales::comma) +
coord_flip()
# Create the box plot
ggplot(metrics_df[metrics_df$total_app_time_minutes > 0,], aes(x = "", y = total_app_time_minutes / 60, fill = "App Time")) +
geom_boxplot(width = 0.5, position = position_dodge(width = 0.9), color = "black", outlier.color = "darkblue", outlier.shape = 16, outlier.size = 3) +
labs(x = "", y = "App Time (Hours)", fill = "", caption = paste("Number of Zero Values Filtered Out:", sum(metrics_df$total_app_time_minutes == 0))) +
scale_fill_manual(values = "#1F78B4") +
theme(legend.position = "top", legend.title = element_blank()) +
scale_y_continuous(labels = scales::comma) +
coord_flip()
ggplot(metrics_df, aes(x = total_game_time_minutes / 60, y = total_app_time_minutes / 60, color = total_app_time_minutes / 60)) +
geom_point() +
labs(x = "Total Game Time (Hours)", y = "Total App Time (Hours)", color = "Total App Time (Hours)") +
scale_color_gradient(low = "blue", high = "red") +
ggtitle("Total Time: Game vs App (Hours)") +
scale_x_continuous(labels = scales::comma) +
scale_y_continuous(labels = scales::comma)
ts_profiles = da_profiles_ts_decomp(da_profiles)
## [1] "Insufficient data for profile 12 - skipping decomposition."
## [1] "Insufficient data for profile 24 - skipping decomposition."
## [1] "Insufficient data for profile 27 - skipping decomposition."
## [1] "Insufficient data for profile 55 - skipping decomposition."
## [1] "Insufficient data for profile 56 - skipping decomposition."
## [1] "Insufficient data for profile 113 - skipping decomposition."
## [1] "Insufficient data for profile 120 - skipping decomposition."
## [1] "Insufficient data for profile 157 - skipping decomposition."
## [1] "Insufficient data for profile 170 - skipping decomposition."
## [1] "Insufficient data for profile 193 - skipping decomposition."
## [1] "Insufficient data for profile 194 - skipping decomposition."
## [1] "Insufficient data for profile 199 - skipping decomposition."
# Define UI
ui <- fluidPage(
titlePanel("Time Series Decomposition Plots"),
sidebarLayout(
sidebarPanel(
selectInput("profile", "Select Profile:", choices = seq_along(ts_profiles), selected = ts_profiles[[1]], width = "25%")
),
mainPanel(
plotOutput("plot1"),
plotOutput("plot2"),
plotOutput("plot3"),
plotOutput("plot4")
)
)
)
# Define server
server <- function(input, output) {
output$plot1 <- renderPlot({
profile <- ts_profiles[[as.numeric(input$profile)]]
plot_data <- data.frame(
date = time(profile$ts[[1]]),
stringsAsFactors = FALSE
)
plot_data$original1 <- profile$ts[[1]]
plot_data$trend1 <- profile$decomposition[[1]][["trend"]]
plot_data$seasonal1 <- profile$decomposition[[1]][["seasonal"]]
plot_data$residual1 <- profile$decomposition[[1]][["random"]]
ggplot(plot_data, aes(x = date)) +
geom_line(aes(y = original1, color = "Original")) +
geom_line(aes(y = trend1, color = "Trend")) +
geom_line(aes(y = seasonal1, color = "Seasonal")) +
geom_line(aes(y = residual1, color = "Residual")) +
labs(x = "Date", y = "Value", color = "Component") +
scale_color_manual(values = c("Original" = "black", "Trend" = "blue",
"Seasonal" = "red", "Residual" = "green")) +
facet_wrap(~ "Time Series 1: Daily Lifetime EIR", ncol = 1) +
theme_minimal()
})
output$plot2 <- renderPlot({
profile <- ts_profiles[[as.numeric(input$profile)]]
plot_data <- data.frame(
date = time(profile$ts[[2]]),
stringsAsFactors = FALSE
)
plot_data$original2 <- profile$ts[[2]]
plot_data$trend2 <- profile$decomposition[[2]][["trend"]]
plot_data$seasonal2 <- profile$decomposition[[2]][["seasonal"]]
plot_data$residual2 <- profile$decomposition[[2]][["random"]]
ggplot(plot_data, aes(x = date)) +
geom_line(aes(y = original2, color = "Original")) +
geom_line(aes(y = trend2, color = "Trend")) +
geom_line(aes(y = seasonal2, color = "Seasonal")) +
geom_line(aes(y = residual2, color = "Residual")) +
labs(x = "Date", y = "Value", color = "Component") +
scale_color_manual(values = c("Original" = "black", "Trend" = "blue",
"Seasonal" = "red", "Residual" = "green")) +
facet_wrap(~ "Time Series 2: Weekly EIR", ncol = 1) +
theme_minimal()
})
output$plot3 <- renderPlot({
profile <- ts_profiles[[as.numeric(input$profile)]]
plot_data <- data.frame(
date = time(profile$ts[[3]]),
stringsAsFactors = FALSE
)
plot_data$original3 <- profile$ts[[3]]
plot_data$trend3 <- profile$decomposition[[3]][["trend"]]
plot_data$seasonal3 <- profile$decomposition[[3]][["seasonal"]]
plot_data$residual3 <- profile$decomposition[[3]][["random"]]
ggplot(plot_data, aes(x = date)) +
geom_line(aes(y = original3, color = "Original")) +
geom_line(aes(y = trend3, color = "Trend")) +
geom_line(aes(y = seasonal3, color = "Seasonal")) +
geom_line(aes(y = residual3, color = "Residual")) +
labs(x = "Date", y = "Value", color = "Component") +
scale_color_manual(values = c("Original" = "black", "Trend" = "blue",
"Seasonal" = "red", "Residual" = "green")) +
facet_wrap(~ "Time Series 3: Monthly EIR", ncol = 1) +
theme_minimal()
})
output$plot4 <- renderPlot({
profile <- ts_profiles[[as.numeric(input$profile)]]
plot_data <- data.frame(
date = time(profile$ts[[4]]),
stringsAsFactors = FALSE
)
plot_data$original4 <- profile$ts[[4]]
plot_data$trend4 <- profile$decomposition[[4]][["trend"]]
plot_data$seasonal4 <- profile$decomposition[[4]][["seasonal"]]
plot_data$residual4 <- profile$decomposition[[4]][["random"]]
ggplot(plot_data, aes(x = date)) +
geom_line(aes(y = original4, color = "Original")) +
geom_line(aes(y = trend4, color = "Trend")) +
geom_line(aes(y = seasonal4, color = "Seasonal")) +
geom_line(aes(y = residual4, color = "Residual")) +
labs(x = "Date", y = "Value", color = "Component") +
scale_color_manual(values = c("Original" = "black", "Trend" = "blue",
"Seasonal" = "red", "Residual" = "green")) +
facet_wrap(~ "Time Series 4: Days Since Achievement Earned", ncol = 1) +
theme_minimal()
})
}
# Run the Shiny app
shinyApp(ui = ui, server = server)
ts_profiles <- ts_profiles %>% keep(~ !is.null(.))
# Initialize empty lists for data and dtrain
t1_data_list <- list()
t1_dtrain_list <- list()
t2_data_list = list()
t2_dtrain_list = list()
t3_data_list = list()
t3_dtrain_list = list()
t4_data_list = list()
t4_dtrain_list = list()
t5_data_list = list()
t5_dtrain_list = list()
for (i in 1:length(ts_profiles)) {
for (j in 1:5) { # Loop over target variables (j = 1 for ts[[1]], j = 2 for ts[[2]])
# Extract the target variable (daily_lt_eir) and create lagged variables as features
target <- as.vector(ts_profiles[[i]][["ts"]][[j]])
# Add a small constant to handle zero values and apply log transformation
#target_transformed <- log(target + 1e-6)
lag_1day = lag(target, 1)
lag_1week = lag(target, 7)
lag_2week = lag(target, 14)
lag_1month = lag(target, 28)
year = ts_profiles[[i]][["profile"]][["year"]]
month.x = ts_profiles[[i]][["profile"]][["month.x"]]
day_of_year = ts_profiles[[i]][["profile"]][["day_of_year"]]
week = ts_profiles[[i]][["profile"]][["week"]]
# Combine the features and target into a data frame
data <- data.frame(target, year, month.x, day_of_year, week, lag_1day, lag_1week, lag_2week, lag_1month)
data <- na.omit(data) # Remove rows with missing values
# Convert the data to DMatrix format
dtrain <- xgb.DMatrix(data = as.matrix(data[, -1]), label = data[, 1])
# Add the data and dtrain to their respective lists
if (j == 1) {
t1_data_list[[i]] <- data
t1_dtrain_list[[i]] <- dtrain
} else if (j == 2) {
t2_data_list[[i]] <- data
t2_dtrain_list[[i]] <- dtrain
} else if (j == 3) {
t3_data_list[[i]] <- data
t3_dtrain_list[[i]] <- dtrain
} else if (j == 4) {
t4_data_list[[i]] <- data
t4_dtrain_list[[i]] <- dtrain
} else if (j == 5) {
t5_data_list[[i]] <- data
t5_dtrain_list[[i]] <- dtrain
}
}
}
rm(data)
rm(dtrain)
t1_5fold_full = get_cv_folds(t1_data_list, 5)
t1_10fold_full = get_cv_folds(t1_data_list, 10)
t1_25fold_full = get_cv_folds(t1_data_list, 25)
# Assuming you have an xgb.DMatrix called "dtrain"
t1_params <- list(
objective = "reg:squarederror",
eval_metric = "rmse",
max_depth = 8,
eta = 0.1,
subsample = 0.8,
colsample_bytree = 0.8
)
# 100 BOOST ROUNDS
t1_5fold_100n_models = lapply(1:189, function(index) train_cv_target1_models(index, t1_5fold_full, t1_params, t1_data_list, t1_dtrain_list, 100))
t1_10fold_100n_models = lapply(1:189, function(index) train_cv_target1_models(index, t1_10fold_full, t1_params, t1_data_list, t1_dtrain_list, 100))
t1_25fold_100n_models = lapply(1:189, function(index) train_cv_target1_models(index, t1_25fold_full, t1_params, t1_data_list, t1_dtrain_list, 100))
# 1000 BOOST ROUNDS
#t1_5fold_1000n_models = lapply(1:189, function(index) train_cv_target1_models(index, t1_5fold_full, t1_params, t1_data_list, t1_dtrain_list, 1000))
#t1_10fold_1000n_models = lapply(1:189, function(index) train_cv_target1_models(index, t1_10fold_full, t1_params, t1_data_list, t1_dtrain_list, 1000))
#t1_25fold_1000n_models = lapply(1:189, function(index) train_cv_target1_models(index, t1_25fold_full, t1_params, t1_data_list, t1_dtrain_list, 1000))
# UI
ui <- fluidPage(
titlePanel("Evaluation Metrics"),
sidebarLayout(
sidebarPanel(
selectInput("profile", "Select Profile:", choices = c(1:189))
),
mainPanel(
plotOutput("t1_5fold_100n_metrics_plot"),
plotOutput("t1_10fold_100n_metrics_plot"),
plotOutput("t1_25fold_100n_metrics_plot"),
tableOutput("mean_metrics")
)
)
)
# Server
server <- function(input, output, session) {
output$t1_5fold_100n_metrics_plot <- renderPlot({
profile <- as.integer(input$profile)
# Create a data frame with the evaluation metrics for the selected profile
t1_5fold_100n_metrics_df <- data.frame(
Fold = 1:length(t1_5fold_100n_models[[profile]][[3]][[1]]),
RMSE = t1_5fold_100n_models[[profile]][[3]][[1]],
MAPE = t1_5fold_100n_models[[profile]][[3]][[2]],
SMAPE = t1_5fold_100n_models[[profile]][[3]][[3]]
)
# Plot the evaluation metrics
ggplot(t1_5fold_100n_metrics_df, aes(x = Fold)) +
geom_line(aes(y = RMSE, color = "RMSE"), size = 1) +
geom_line(aes(y = MAPE, color = "MAPE"), size = 1) +
geom_line(aes(y = SMAPE, color = "SMAPE"), size = 1) +
labs(title = paste("Evaluation Metrics (5 Fold) - Profile", profile),
x = "Fold",
y = "Value",
color = "Metric") +
scale_color_manual(values = c("RMSE" = "red", "MAPE" = "blue", "SMAPE" = "green")) +
theme_minimal()
})
output$t1_10fold_100n_metrics_plot <- renderPlot({
profile <- as.integer(input$profile)
# Create a data frame with the evaluation metrics for the selected profile
t1_10fold_100n_metrics_df <- data.frame(
Fold = 1:length(t1_10fold_100n_models[[profile]][[3]][[1]]),
RMSE = t1_10fold_100n_models[[profile]][[3]][[1]],
MAPE = t1_10fold_100n_models[[profile]][[3]][[2]],
SMAPE = t1_10fold_100n_models[[profile]][[3]][[3]]
)
# Plot the evaluation metrics
ggplot(t1_10fold_100n_metrics_df, aes(x = Fold)) +
geom_line(aes(y = RMSE, color = "RMSE"), size = 1) +
geom_line(aes(y = MAPE, color = "MAPE"), size = 1) +
geom_line(aes(y = SMAPE, color = "SMAPE"), size = 1) +
labs(title = paste("Evaluation Metrics (10 Fold) - Profile", profile),
x = "Fold",
y = "Value",
color = "Metric") +
scale_color_manual(values = c("RMSE" = "red", "MAPE" = "blue", "SMAPE" = "green")) +
theme_minimal()
})
output$t1_25fold_100n_metrics_plot <- renderPlot({
profile <- as.integer(input$profile)
# Create a data frame with the evaluation metrics for the selected profile
t1_25fold_100n_metrics_df <- data.frame(
Fold = 1:length(t1_25fold_100n_models[[profile]][[3]][[1]]),
RMSE = t1_25fold_100n_models[[profile]][[3]][[1]],
MAPE = t1_25fold_100n_models[[profile]][[3]][[2]],
SMAPE = t1_25fold_100n_models[[profile]][[3]][[3]]
)
# Plot the evaluation metrics
ggplot(t1_25fold_100n_metrics_df, aes(x = Fold)) +
geom_line(aes(y = RMSE, color = "RMSE"), size = 1) +
geom_line(aes(y = MAPE, color = "MAPE"), size = 1) +
geom_line(aes(y = SMAPE, color = "SMAPE"), size = 1) +
labs(title = paste("Evaluation Metrics (25 Fold) - Profile", profile),
x = "Fold",
y = "Value",
color = "Metric") +
scale_color_manual(values = c("RMSE" = "red", "MAPE" = "blue", "SMAPE" = "green")) +
theme_minimal()
})
output$mean_metrics <- renderTable({
profile <- as.integer(input$profile)
# Create a data frame with the mean metrics for the selected profile
mean_metrics_df <- data.frame(
Metric = rep(c("Mean RMSE", "Mean MAPE", "Mean SMAPE"), times = 3),
Value = c(
t1_5fold_100n_models[[profile]][[4]],
t1_5fold_100n_models[[profile]][[5]],
t1_5fold_100n_models[[profile]][[6]],
t1_10fold_100n_models[[profile]][[4]],
t1_10fold_100n_models[[profile]][[5]],
t1_10fold_100n_models[[profile]][[6]],
t1_25fold_100n_models[[profile]][[4]],
t1_25fold_100n_models[[profile]][[5]],
t1_25fold_100n_models[[profile]][[6]]
),
Fold = rep(c("5-fold", "10-fold", "25-fold"), each = 3)
)
mean_metrics_df
})
}
# Run the Shiny app
shinyApp(ui = ui, server = server)
# UI
ui <- fluidPage(
titlePanel("Evaluation Metrics"),
sidebarLayout(
sidebarPanel(
selectInput("profile", "Select Profile:", choices = c(1:189))
),
mainPanel(
plotOutput("t1_5fold_1000n_metrics_plot"),
plotOutput("t1_10fold_1000n_metrics_plot"),
plotOutput("t1_25fold_1000n_metrics_plot"),
tableOutput("mean_metrics")
)
)
)
# Server
server <- function(input, output, session) {
output$t1_5fold_1000n_metrics_plot <- renderPlot({
profile <- as.integer(input$profile)
# Create a data frame with the evaluation metrics for the selected profile
t1_5fold_1000n_metrics_df <- data.frame(
Fold = 1:length(t1_5fold_1000n_models[[profile]][[3]][[1]]),
RMSE = t1_5fold_1000n_models[[profile]][[3]][[1]],
MAPE = t1_5fold_1000n_models[[profile]][[3]][[2]],
SMAPE = t1_5fold_100n_models[[profile]][[3]][[3]]
)
# Plot the evaluation metrics
ggplot(t1_5fold_1000n_metrics_df, aes(x = Fold)) +
geom_line(aes(y = RMSE, color = "RMSE"), size = 1) +
geom_line(aes(y = MAPE, color = "MAPE"), size = 1) +
geom_line(aes(y = SMAPE, color = "SMAPE"), size = 1) +
labs(title = paste("Evaluation Metrics - Profile", profile),
x = "Fold",
y = "Value",
color = "Metric") +
scale_color_manual(values = c("RMSE" = "red", "MAPE" = "blue", "SMAPE" = "green")) +
theme_minimal()
})
output$t1_10fold_1000n_metrics_plot <- renderPlot({
profile <- as.integer(input$profile)
# Create a data frame with the evaluation metrics for the selected profile
t1_10fold_1000n_metrics_df <- data.frame(
Fold = 1:length(t1_10fold_1000n_models[[profile]][[3]][[1]]),
RMSE = t1_10fold_1000n_models[[profile]][[3]][[1]],
MAPE = t1_10fold_1000n_models[[profile]][[3]][[2]],
SMAPE = t1_10fold_1000n_models[[profile]][[3]][[3]]
)
# Plot the evaluation metrics
ggplot(t1_10fold_1000n_metrics_df, aes(x = Fold)) +
geom_line(aes(y = RMSE, color = "RMSE"), size = 1) +
geom_line(aes(y = MAPE, color = "MAPE"), size = 1) +
geom_line(aes(y = SMAPE, color = "SMAPE"), size = 1) +
labs(title = paste("Evaluation Metrics - Profile", profile),
x = "Fold",
y = "Value",
color = "Metric") +
scale_color_manual(values = c("RMSE" = "red", "MAPE" = "blue", "SMAPE" = "green")) +
theme_minimal()
})
output$t1_25fold_1000n_metrics_plot <- renderPlot({
profile <- as.integer(input$profile)
# Create a data frame with the evaluation metrics for the selected profile
t1_25fold_1000n_metrics_df <- data.frame(
Fold = 1:length(t1_25fold_1000n_models[[profile]][[3]][[1]]),
RMSE = t1_25fold_1000n_models[[profile]][[3]][[1]],
MAPE = t1_25fold_1000n_models[[profile]][[3]][[2]],
SMAPE = t1_25fold_1000n_models[[profile]][[3]][[3]]
)
# Plot the evaluation metrics
ggplot(t1_25fold_1000n_metrics_df, aes(x = Fold)) +
geom_line(aes(y = RMSE, color = "RMSE"), size = 1) +
geom_line(aes(y = MAPE, color = "MAPE"), size = 1) +
geom_line(aes(y = SMAPE, color = "SMAPE"), size = 1) +
labs(title = paste("Evaluation Metrics - Profile", profile),
x = "Fold",
y = "Value",
color = "Metric") +
scale_color_manual(values = c("RMSE" = "red", "MAPE" = "blue", "SMAPE" = "green")) +
theme_minimal()
})
output$mean_metrics <- renderTable({
profile <- as.integer(input$profile)
# Create a data frame with the mean metrics for the selected profile
mean_metrics_df <- data.frame(
Metric = c("Mean RMSE", "Mean RMSE", "Mean RMSE", "Mean MAPE", "Mean MAPE", "Mean MAPE", "Mean SMAPE", "Mean SMAPE", "Mean SMAPE"),
Value = c(
t1_5fold_1000n_models[[profile]][[4]],
t1_10fold_1000n_models[[profile]][[4]],
t1_25fold_1000n_models[[profile]][[4]],
t1_5fold_1000n_models[[profile]][[5]],
t1_10fold_1000n_models[[profile]][[5]],
t1_25fold_1000n_models[[profile]][[5]],
t1_5fold_1000n_models[[profile]][[6]],
t1_10fold_1000n_models[[profile]][[6]],
t1_25fold_1000n_models[[profile]][[6]]
),
Fold = rep(c("5-fold", "10-fold", "25-fold"), each = 3)
)
mean_metrics_df
})
}
# Run the Shiny app
shinyApp(ui = ui, server = server)